####################################### DATA ANALYSIS WITH R #######################################

#
## NUMERICAL SUMMARY
#

possum<-read.table("possum.txt",header=TRUE)
attach(possum)
head(possum, n=3)                             # display the first three rows
str(possum)                                   # get information on each of the columns
dim(possum)                                   # 104 obs, 14 variables
possum[!complete.cases(possum),]              # which rows have missing values in the data 
range(earconch)                               # min and max values of ear conch length
mean(earconch)                                # sample mean 
mean(earconch,trim=0.95)                      # trimmed mean (the first 5% smaller and 5% larger 
                                              # observations are omitted in computing 
                                              # the mean, robust to the presence of outliers)
mean(footlgth)                                # [1] NA, there is a not available value
mean(footlgth,na.rm=T)                        # omit NA values from the computation
median(earconch)                              # median
var(earconch)                                 # sample variance
sum((earconch-mean(earconch))^2)/(length(earconch)-1)
                                              # the same
sqrt(var(earconch));                          # standard deviation 
sd(earconch)                                  # the same
quart<-quantile(earconch,probs=c(0.25,0.75))  # quartiles
quart
sd(earconch);0.75*(quart[2]-quart[1])         # s~0.75H suggests normality 
summary(earconch)                             # range, mean, median and quartiles
detach(possum)

summary(possum)                               # see what generates for 'Pop' and 'sex'

#
## GRAPHICAL SUMMARY
#

#
## Histograms and density plots
#

possum.f<-possum[possum$sex=="f",]            # create a subset data frame with female possums
                                              # same as 'possum[which(possum$sex=="f"),]'
head(possum.f,n=3)                              # equivalent to data frame 'fossum'
dim(possum.f)                                 # 43 obs

attach(possum.f) 
hist(totlngth, probability=TRUE)              # histogram of total length of female possums (relative frequency and not absolute)
hist(totlngth, prob=TRUE,breaks=10)           # change the number of breakpoints
                                              # "prob=TRUE" works as well as "probability=TRUE"

# how to change the breakpoints 

range(totlngth)                               # min=75.0 max=96.5

par(mfrow=c(2,1))                             # to get a 2 by 1 layout

hist(totlngth, breaks=72.5+(0:5)*5,           # A: specify breaks at 72.5,77.5,...
     prob=TRUE,xlab="Total length (cm)",      # put legends on x-axis
     main="A: breaks at 72.5,77.5,...")       # put a title
#mtext("A: breaks at 72.5,77.5,...",side=3)   
                                              # see the difference with main=""
hist(totlngth, breaks=75+(0:5)*5,             # B: specify breaks at 75,80,...
     prob=TRUE,xlab="Total length (cm)", 
     main="B: breaks at 75,80,...")
#mtext("A: breaks at 72.5,77.5,...",line=3,side=3)   
                                              # see the difference with main=""

                                              # plot A suggests that the distribution is
                                              # asymmetric (skewed to the left),
                                              # while plot B suggests symmetry

dens<-density(totlngth)                       # density estimate, depends on bandwidth parameter
x.lim<-range(dens$x);y.lim<-range(dens$y)     # to be used as arguments of 'hist'
hist(totlngth, breaks=72.5+(0:5)*5,           # C: same as histogram A
     prob=TRUE,xlim=x.lim,ylim=y.lim,
     xlab="Total length (cm)", 
     main="C: breaks as in A")
lines(dens)                                   # add density plot

hist(totlngth, breaks=75+(0:5)*5,             # D: same as histogram B
     prob=TRUE,xlim=x.lim,ylim=y.lim,
     xlab="Total length (cm)", 
     main="D: breaks as in B")
lines(dens)                                   # add density plot
                                              # a density plot is more reliable in evaluating the 
                                              # shape of the distribution

par(mfrow=c(1,1))                             # back to 1 by 1 layout

detach(possum.f)

# LATTICE style density plot (OPTIONAL) 
                                              # useful for providing graphs that allow ready
                                              # comparison between grouping in the data 
library(lattice)                              

trellis.device(color=FALSE)                   # open the device, no color                    
densityplot(~earconch | Pop, groups=sex,      # density plot that compares 'earconch' of
  data=possum, pch=c(1,4),                    # male and female possums conditioned 
  auto.key=list(columns=2,points=TRUE),       # on the levels of the variable Pop (factor) 
  aspect=1)                                   # auto.key is for the legend, aspect controls   
                                              # physical aspect ratio of the panels

#trellis.device(color=FALSE)                  # try this, less options specified                    
#densityplot(~earconch | Pop,groups=sex,data=possum)

help(densityplot)                             # see the help if interested

#
## Box plots
#

#possum.f<-subset(possum, sex=="f")            

old.par <- par(no.readonly = TRUE)            # all graphical parameters settings
                                              # that could be changed

par(fig=c(0, 1, 0, 0.5))                      # coordinates of the figure region in the display
                                              # region of the device:  0 to 50% of 
                                              # height of figure region
boxplot(possum.f$totlngth, horizontal=TRUE,   # box plot of total length of female possums
        xlab="Total length (cm)",             # horizontal=FALSE is the default
        main="Box plot")
rug(possum.f$totlngth)                        # add data points on the x-axis
par(fig=c(0, 1, 0.5, 1),new=TRUE)             # 50% to 100% of height of figure region, 
                                              # specify 'new=TRUE' since we use the same device
                                              # (counterintuitive indeed, see 'help(par)')   
possum.m<-possum[possum$sex=="m",]            # create a subset data frame with male possums
boxplot(possum.m$totlngth, horizontal=TRUE,   # box plot of total length of male possums
        xlab="Total length (cm)",
        main="Box plot")
rug(possum.m$totlngth)                        # add data points on the x-axis

par(fig=c(0, 1, 0.2, 0.8))                    # 20% to 80% of height of figure region
boxplot(totlngth~sex, data=possum,            # data values are split into groups according 
        horizontal=TRUE,                      # to the grouping variable sex (a factor)
        xlab="Total length (cm)",             # display one box plot for each group
        main="Box plots by sex")              # labels on y-axis taken from the levels of sex
rug(possum.f$totlngth,side=1)                 # add data points on the x-axis
rug(possum.m$totlngth,side=3)                 # add data points on the top-horizontal axis

#boxplot(split(possum$totlngth,possum$sex),   # do the same 
#        horizontal=TRUE)                     # 'split' divides an object according to levels in
                                              # a factor, no legend on the x-axis

x<-possum.f$totlngth                          
y<-possum.m$totlngth
par(fig=c(0.2, 0.8, 0, 1))                    # 20% to 80% of width of figure region
boxplot(x,y,xlab="Total length (cm)",         # the same as before, but box plots now 
        main="Box plot by sex")               # are displayed vertically (default)           
rug(x,side=2);rug(y,side=4)                   # add data points on the axes
                                              # no legend on the y-axis
                                              
par(old.par)                                  # restore the previous graphical settings

plot( x,rep(0,length(x)),ylim=c(0,1),         # a simple alternative to display 
     xlab="Total length (cm)",ylab="")        # the two sub samples
points( y,rep(1,length(y)) ) 
#plot(c(x,y),rep(0:1,c(length(x),length(y))), # do the same more concisely 
#     xlab="Total length (cm)",ylab="")        
     
plot(as.numeric(sex)~totlngth,data=possum)    # do the same, points at 1 and 2 this time
unclass(possum$sex)                           # remember how a factor is coded in R


#
## PROBABILITY DISTRIBUTIONS
#

rbinom(10,size=1,prob=.5)                     # 10 Bernoulli trials
round( dpois(0:4,lambda=3), 2)                # Prob(X=x), x=0,1,2,3,4  X~Pois(3)
round( ppois(0:4,lambda=3), 2)                # Prob(X<=n), X~Pois(3)  (CUMULATIVE DISTRIBUTION FUNCTION)
round( cumsum( dpois(0:4,lambda=3) ), 2)      # the same
y<- rexp(50,rate=2)                           # 50 variates from exponential variable 
                                              # of rate 2, Exp(2) (by default 'rate=1') 
summary(y)                                    # mean approximatively equal to 1/rate
hist(y,prob=TRUE,breaks=10,                   # histogram of y
     main="Histogram of exponential variates")# 'breaks' used for specifying the number of bins     
z<-seq(0,3,length=31)                         # 31 equaly spaced points in [0,3]
lines(z,dexp(z,2))                            # add the density of Exp(2)

1-pnorm(1.644)                                # Prob(X>1.644),      X~Norm(0,1)
qexp(0.75,2)                                  # q: Prob(X<=q)=0.75, X~Exp(2)

#
## Normal Distribution
#

x<- rnorm(100)                                # 100 variates from standard normal Norm(0,1)
                                              # (by default 'mean=0' and 'sd=1')
summary(x)                                    # mean approximatively equal to 1,
quart<-summary(x)[c(2,5)]                     # quartiles
sd(x);0.75*(quart[2]-quart[1])                # s~0.75H suggests normality

hist(x,prob=TRUE)                             # histogram, shoul look symmetric and bell-shaped

                                              # PLOT THE DENSITY FUNCTION N(0,1)
z<-seq(-3,3,length=31)                        # 31 equally spaced points in [-3,3]    
ht<-dnorm(z)                                  # density function evaluated a these points
plot(z,ht,type="l",xlab="",ylab="Density",    # plot the normal density, in the range (-3,3)
     main="Norm(0,1)",yaxs="i")               # 'yaxs="i"' locates the axes at the limits
                                              # of the data
hist(x,prob=TRUE,ylim=range(ht),              # specify y-axis range to include the density 
     main="Histogram of normal variates")                             
lines(z,ht)                                   # add the normal density to the histogram

                                              # Additional examples
pnorm(0)                                      # 0.5, exactly half the area under the density  
                                              # is to the left of the mean 
pnorm(1)                                      # .841 
pnorm(-1.96)                                  # .025 
pnorm(1.96)                                   # .975 
pnorm(1.96, mean=2)                           # .484 for a normal distribution with mean=2, sd=1 
pnorm(1.96, sd=2)                             # .836 for a normal distribution with mean=0, sd=2 
qnorm(.9)                                     # 90th percentile; mean=0 and sd=1 
qnorm(0.841)                                  # 1.0 
qnorm(0.5)                                    # 0 
qnorm(0.975)                                  # 1.96 
qnorm(c(.1,.2,.3))                            # -1.282 -0.842 -0.524, 
                                              # i.e. 10th, 20th and 30th percentiles 
qnorm(.1, mean=100, sd=10)                    # 87.2 10th percentile, mean=100, sd=10 

#pdf("fig.pdf",paper="special",               # save the plot in a pdf file 
#     height=6,width=6)

old.par <- par(no.readonly = TRUE)            #OPTIONAL PLOT             
par(mar=c(4.1,4.1,1.1,1.1), mgp=c(2.5,0.5,0), 
    cex.axis=0.85)                            
z <- seq(-3,3,length=101)
plot(z, dnorm(z), type="l", 
     xlab="",ylab="", yaxs="i", bty="L")
title(xlab=list("z",cex=1.5),
      ylab=list("normal density",cex=1.5))
polygon(c(z[z <= 1.0],1.0),c(dnorm(z[z <= 1.0]), dnorm(-3)), col="grey")
chh <- par()$cxy[2]
arrows(-1.8, 0.32, -0.25, 0.2, length=0.07, xpd=T)
cump <- round(pnorm(1), 3)
text(-1.8, 0.32+0.75*chh, paste("pnorm(1)\n", "=", cump), xpd=T, cex=1.2)
par(old.par)                                  # restore the previous graphical settings

#dev.off()

#
## Poisson Distribution
#

k<-0:15                                       # [1] 0 1 ... 14 15
l<-5                                          # Poisson rate=5 (also mean of the distribution)
p<-dpois(k,lambda=l)                          # exact probabilities at k
round(p,3)

#pdf("fig.pdf",paper="special",               # save the plot in a pdf file
#    height=6,width=6)

plot(k,p,type="h",lwd=5,ylim=c(0,0.25),       # ' type="h" ' for histogram like vertical lines 
     xlab="",ylab="",main="")                 # specify axes legends and title later
y.string<-expression(                         # string to be used as ylab
        paste("Poisson prob ",lambda==5)      # the text is mathematical expression formatted 
        )                                     # according to TEX-like rules ('==' is the syntax 
                                              # for equals) see 'help(plotmath)'
title(xlab=list("k",cex=1.5),                 # 'title' takes lists as arguments
      ylab=list(y.string,cex=1.5))            # specify the font size by 'cex'

#dev.off()

# Histograms from simulated data for n=10,100,500,1000
# use of a loop + conditional evaluation + add a legend to an existing plot 


kfin<-17                                      # upper limit for the distribution 
n<-c(10,100,500,1000)                         # sample sizes
y.string<-expression(paste("poisson prob ",
                           lambda==5))
t.string<-paste("n=",n)                       # 'paste' is used to create four different titles 
                                              # with the sample size specified

k<-0:15;l<-5;p<-dpois(k,lambda=l)
plot(k,p,type="h",lwd=5,ylim=c(0,0.25),       # histogram-like plot of the true 
     xlab="",ylab="",main="")                 # distribution Pois(5)
j=1
title(main=list(t.string[j],cex=1.4),         # 't.string[j]' contains "n=n[j]", the title 
      xlab=list("k",cex=1.5),                 # x-axis legend "k"
      ylab=list(y.string,cex=1.5))            # y-axis legend "poisson prob lamda=5" 
r<-rpois(n[j],lambda=5)                       # simulate n[j] Poisson variates of rate 5
out<-hist(r,breaks=c(-1:kfin),plot=F)         # store the output of the calling 'hist'
                                              # we want the relative frequency at 0:17
                                              # so we specify the breaks as -1:17, 
                                              # then we recover
                                              # the component 'density' from the list 'out' 
for(i in 0:kfin) {                            # loop for plotting the vertival lines at each
                                              #  value in 0:17 with eight = relative frequency
  if(out$density[i+1]>0) {                    # check that the relative frequency is non null
    lines(c(i+0.2,i+0.2),                     # if so, plot the vertical lines
          c(0,out$density[i+1]),lwd=5,col=2)  # a little to the right
  }
}                                             
legend(8,.2,c("true prob.","histogram"),      # add a legend
       lty=c(1,1),col=c(1,2),lwd=c(5,5))    

old.par <- par(no.readonly = TRUE)

par(mfrow=c(2,2))                             # to get a 2 by 2 layout with the 4 plot for
                                              # n=10, 100, 500, 1000

for(j in 1:4) {                               # external loop for drawing the four plots
  plot(k,p,type="h",lwd=5,ylim=c(0,0.25),xlab="",ylab="",main="") 
  title(main=list(t.string[j],cex=1.4),xlab=list("k",cex=1.5),ylab=list(y.string,cex=1.5))            
  r<-rpois(n[j],lambda=5)                     
  out<-hist(r,breaks=c(-1:kfin),plot=F)
  for(i in 0:kfin) {                          # internal loop
    if(out$density[i+1]>0) {                   
      lines(c(i+0.2,i+0.2),c(0,out$density[i+1]),lwd=5,col=2)
    }
  }                                           # close internal loop
  legend(8,.2,c("true prob.","histogram"),lty=c(1,1),col=c(1,2),lwd=c(5,5))
}                                             # close external loop
par(old.par)                                  # restore the previous graphical settings
                                              # i.e. back to 1 by 1 layout
#
## Iteration
#

x<- (1:4000)*(pi/40);                         # evaluate the sin of each element of 'x'
y<-rep(0,length=length(x))                    # and store them into the vector 'y' 
for(i in 1:length(x)) {                       # loop, i serves as index for y[] and x[]
	y[i]<-sin(x[i]) 
	}     
y
for(i in 1:length(x)) y[i]<-sin(x[i])         # the same (without {...}, only 1 command)
y
y<- sin(x)                                    # the same but faster, try 400000 instead of 4000
y

#
## Conditional Evaluation
#

                                              # simple example of 'if' statement
x<-rpois(1,5)                                 # x is a value from Pois(5)
if (x>3) {                                    
  y<-1                                        
  z<-2                                        # (y,z)=(1,2) if x>3
} else {
  y<-2
  z<-1 }                                      # (y,z)=(1,2) if x<=3
x;y;z

x<-rpois(1,5)                                 # example of 'if' statements that are nested
count<-0                                      # count=1  if 2<x<5
if (x>2) {                                    # count=-1 if x<=2 or x>=5
  if (x<5) {
  	count<-count+1
  } else count<-count-1
}
x;count

count<-0                                      # this leads a syntax error: 
if (x>2)                                      # there is a new line between the command 
  if (x<5) count<-count+1                     # in the 'if'-part and the 'else'-part
   else {
     count<-count-1
   }

count<-0                                      # this is ok 
if (x>2)                                      # no {} are used to delimit branches
  if (x<5) 
    count<-count+1 else  count<-count-1
x;count

x<-rpois(1,5)
count<-0                                      # this is ok
if (x>2) {                                    # here we use {} to delimit branches
  if (x<5) {
    count<-count+1 
  } else {
      count<-count-1
    }
}
x;count

x<-rpois(1,5)
count<-0                                      # compound conditions
if (x>2 & x<5){                               # does the same as before
  count<-count+1
} else count<-count-1
x;count

#
## Random Number Generation
#

set.seed(1)
runif(10)
set.seed(1)
runif(10)                                     # the same
runif(10)                                     # different

#
## QQ Plot
#

y<- rexp(50,2) 
ppoints(y)                                    # sequence of ìprobabilityî points 
                                              # (1:m - a)/(m + (1-a)-a) where m is length(x)
seq(0,1,length=length(y))                     # note the difference, this are all equi-spaced

plot(qexp(ppoints(y),1),sort(y))              # plot the empirical quantiles against the 
abline(0,1,col=2)                             # theoretical quantiles 
                                              # data consistent with the theoretical distribution 
                                              # should have points falling on the straight line

qqplot(qexp(ppoints(y),1),y)                  # the same, no need to sort y 
abline(0,1,col=2)

qqplot(qexp(ppoints(y),2),y)              
abline(0,1,col=2)                             # in fact y was generated with rate=2

x<- rnorm(50)
qqnorm(x)                                     # R built-in function for comparing against 
abline(0,1,col=2)                             # normal distribution (mean=0, sd=1)
#qqplot(qnorm(ppoints(x)),x)                  # the same 
points(qnorm(ppoints(x)),sort(x),
       pch=16,col=2)                          # the same

x<- rnorm(50,mean=1,sd=1)                     # mean different from zero
qqnorm(x)                                     # points not on the straight line
abline(0,1,col=2)                             
qqnorm(x-mean(x))                             # subtract the empirical mean
abline(0,1,col=2)                             # ok

qqplot(x,y)                                   # comparing two samples
abline(0,1,col=2)                             # they clearly follow different distribution

boxplot(x,y)                                  # boxplot
plot(c(x,y),rep(0:1,c(length(x),length(y))),  # other way for comparing the two samples
     xlab="",ylab="")


# Correlation

x1 <- x2 <- x3 <- (11:30)/5                   # create artificial data for x1, x2, x3
y1 <- x1 + rnorm(20)/2                        # create artificial data for y1
y2 <- 2 - 0.05*x1 + 0.1*((x1 - 1.75))^4 +     # create artificial data for     y2
      1.25 * rnorm(20)
y3 <- (x1 - 3.85)^2 + 0.015 + rnorm(20)/4     # create artificial data for         y3
theta <- ((2 * pi) * (1:20))/20
x4 <- 10 + 4 * cos(theta)                     # create artificial data for                x4
y4 <- 10 + 4 * sin(theta) + (0.5 * rnorm(20)) # create artificial data for                y4

x<-x1;y<-y1 
#x<-x2;y<-y2 
#x<-x3;y<-y3 
#x<-x4;y<-y4
r <- round(cor(x, y), 3)                      # Pearson correlation
rho <- round(cor(rank(x), rank(y)), 3)        # Spearman rank correlation
print(c(r, rho))
plot(x, y, pch = 16)                          # scatterplot

cov(x,y)                                      # sample covariance (same as 'var(x,y)')
var(cbind(x,y))                               # sample variance and covariances
cor(x,y)                                      # Pearson correlation
cor(cbind(x,y))                               # 2 by 2 correlation 
cov(x,y)/(sd(x)*sd(y)); cor(x,y)              # relation among 'cov','cor' and 'sd'

# log transformation 

library(MASS)                                 # data frame Animals (MASS)
attach(Animals)                               
                                              # Correlation between body and brain: 
                                              
cor(body, brain)                              # Pearson correlation
cor(body, brain,method="spearman")            # Spearman rank correlation 
plot(body, brain, pch = 16)                   # scatterplot
plot(body, brain, log="xy",pch = 16)          # scatterplot at the logarithmic scale
plot(log(body), log(brain),pch = 16)          # the same with original axes's scale
detach(Animals)